library(tidyverse)
library(lubridate)
library(caret)
library(nnet)
library(plotly)
library(randomForest)

options(browser = 'false')

1. Data wrangling

First, we will import the raw data, clean them, and merge them according to our analysis needs. Then, we will take the monthly average to incorporate other data.

1.1 Moore reef data from AIMS

# function to clean and wrangle AIMS Moore Reef data to deal with column names and column placement
clean_MooreReef_data <- function(df) {
    df[,2:ncol(df)] <- df[,1:ncol(df)]
    df[,1] <- rownames(df)
    df <- df %>% select(date, colnames(df)[ncol(df)])
    return(df)
}
moore_reef_water_temp_2.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to16Feb2020_2.0m.csv", skip=108, sep= ",", row.names = NULL)
moore_reef_water_temp_9.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to17Dec2017_9.0m.csv", skip=94, sep= ",", row.names = NULL)


# run through function defined above to clean and wrangle data
moore_reef_water_temp_2.0m <- clean_MooreReef_data(moore_reef_water_temp_2.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_9.0m <- clean_MooreReef_data(moore_reef_water_temp_9.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_2.0m <- moore_reef_water_temp_2.0m %>%
    filter(Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG != "Not available")

moore_reef_water_temp_9.0m <- moore_reef_water_temp_9.0m %>%
    filter(Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG != "Not available")
# merge AIMS Moore reef temperature data
aims_temp_data <- Reduce(function(x,y) merge(x,y, by="date"), list(moore_reef_water_temp_2.0m,
                                                              moore_reef_water_temp_9.0m))
                                                            
# convert water temp data from string to numeric type
aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG <- 
    as.numeric(as.character(aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG))

aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG <- 
    as.numeric(as.character(aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG))

# convert to date column into date type
aims_temp_data$date <- as.Date(aims_temp_data$date)

colnames(aims_temp_data) <- c("date", "avg_water_temp_2.0m_flat_site", "avg_water_temp_9.0m_slope_site")


# write combined AIMS Moore reef data
write.csv(aims_temp_data, "cleaned_data/aims_temperatures.csv", row.names = FALSE)

1.2 Coral cover data

# convert date-decimal in coral cover to YYYY-MM-DD format
coral_cover <- read.csv("raw_data/trendgbr-coral-cover-with-ci.csv")
coral_cover$Date <- as.Date(format(date_decimal(coral_cover$Date), "%Y-%m-%d"))

# rename "Date" to "date"
names(coral_cover)[names(coral_cover) == "Date"] <- "date"
colnames(coral_cover) <- c("date", "mean_live_coral_cover_percent", "lower_conf_int", "upper_conf_int", "conf_int_span")
write.csv(coral_cover, "cleaned_data/coral_cover.csv", row.names = FALSE)

1.3 Fish census data

fish_census <- read.csv("raw_data/Fish census 1992-2015.csv", sep="\t", header=T)

fish_census <- fish_census %>% select(gbifID, class, family, genus, species, verbatimScientificName, decimalLatitude, decimalLongitude, dateIdentified)
fish_census$dateIdentified <- ymd_hms(fish_census$dateIdentified)

Group fish census by date, then count how many unique fish species were observed on a given date

fish_species_counts <- fish_census %>%
    arrange(dateIdentified) %>%
    group_by(dateIdentified) %>%
    summarise(num_of_species=n_distinct(species))
## `summarise()` ungrouping output (override with `.groups` argument)
# rename "dateIdentified" to "date"
names(fish_species_counts)[names(fish_species_counts) == "dateIdentified"] <- "date"

fish_species_counts$date <- as.Date(fish_species_counts$date)

write.csv(fish_species_counts, "cleaned_data/fish_species_counts.csv", row.names = FALSE)

1.4 Sea grass data

We would like to see whether we can classify sea grass species based on the survey location (latitude, longitude, and depth below sea level), and the type of sediment and seabed in which they were discovered.

We decided not to use sea grass species belonging to the Halophila genus because they are so widespread and common in tropical waters. Since the halophila genus were also present in nearly all the survey sites where other sea grass species were found, we determined that it did not make much sense from a scientific perspective, as well as from the data set we were given.

seagrass_data <- read.csv("raw_data/GBR_NESP-TWQ-3.2.1-5.4_JCU_Seagrass_1984-2018_Site-surveys.csv") %>%
    # filter out for rows where we have information
    filter(SEDIMENT != "Not recorded" & PRESENCE_A == "Present") %>%

    # delete columns we are not going to use
    select(-FID, -MONTH, -YEAR, 
           -SURVEY_MET, -SURVEY_NAM, 
           # remove seagrass belonging to halophila genus
           -H_CAPRICOR, -H_TRICOSTA, -H_OVALIS, -H_UNINERVI, -H_DECIPIEN, -H_SPINULOS) 

Let’s examine how many presence/absence we see for each sea grass species:

table(seagrass_data$C_ROTUNDAT)
## 
##    No   Yes 
## 30366   187
table(seagrass_data$C_SERRULAT)
## 
##    No   Yes 
## 28899  1654
table(seagrass_data$E_ACOROIDE)
## 
##    No   Yes 
## 30494    59
table(seagrass_data$S_ISOETIFO)
## 
##    No   Yes 
## 30353   200
table(seagrass_data$T_CILIATUM)
## 
##    No 
## 30553
table(seagrass_data$T_HEMPRICH)
## 
##    No   Yes 
## 29674   879
table(seagrass_data$Z_CAPRICOR)
## 
##    No   Yes 
## 20074 10479

We see that there is actually no data at all for presence of T_CILIATUM. In addition, there are only 59 observations for E_ACOROIDE and 187 observations for C_ROTUNDAT that are actually useful in classifying these two species. Because we have a rather large data set, we want 200 or more observations for our classification model.So, we remove these columns and from our classification problem due to insufficient data.

seagrass_data <- seagrass_data %>% select(-C_ROTUNDAT, -T_CILIATUM, -E_ACOROIDE)

Since we deleted some columns, there may be some observations where all the remaining species columns have “No” values for absence. So, we filter out for rows where presence of at least one species was observed.

seagrass_data <- seagrass_data %>%
    filter(C_SERRULAT=="Yes" | 
           S_ISOETIFO=="Yes" |
           T_HEMPRICH=="Yes" |
           Z_CAPRICOR=="Yes")

Now we want to count how many species were found at each location site.

count_species_present <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
    count = 0
    
    if (C_SERRULAT=="Yes") {
        count <- count + 1
    }
    if (S_ISOETIFO=="Yes") {
        count <- count + 1
    }
    if (T_HEMPRICH=="Yes") {
        count <- count + 1
    }
    if (Z_CAPRICOR=="Yes") {
        count <- count + 1
    }
    
    return(count)
}

seagrass_data$num_species_present <- mapply(count_species_present, 
                                            seagrass_data$C_SERRULAT,
                                            seagrass_data$S_ISOETIFO,
                                            seagrass_data$T_HEMPRICH,
                                            seagrass_data$Z_CAPRICOR)

How many survey sites had more than 1 species discovered?

table(seagrass_data$num_species_present)
## 
##     1     2     3 
## 12381   411     3
nrow(seagrass_data)
## [1] 12795

We see that only 3.2% of total survey sites recorded more than 1 species observed. For ease of building a classification model, we will remove these rows. We do not want a situation where our model cannot “decide” in classifying our observations. Because we removed only about 3% of over 12,500 observations, we still preserve statistical power.

seagrass_data <- seagrass_data %>% filter(num_species_present < 2)
table(seagrass_data$C_SERRULAT)
## 
##    No   Yes 
## 11125  1256
table(seagrass_data$S_ISOETIFO)
## 
##    No   Yes 
## 12280   101
table(seagrass_data$T_HEMPRICH)
## 
##    No   Yes 
## 11558   823
table(seagrass_data$Z_CAPRICOR)
## 
##    No   Yes 
##  2180 10201

So, after some data wrangling and exploratory analysis, we will build a model to classify 4 species of seagrass: Cymodocea Serrulata , Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni). Now, let’s build a SPECIES column that collects all the presence in a single variable.

# function to make a species column
get_species_type <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
    if (C_SERRULAT=="Yes") {
        return("C_SERRULAT")
    } else if (S_ISOETIFO=="Yes") {
        return("S_ISOETIFO")
    } else if (T_HEMPRICH =="Yes") {
        return("T_HEMPRICH")
    } else if (Z_CAPRICOR =="Yes") {
        return("Z_CAPIRCOR")
    }
}

# build species column to classify species of each observation based on presence/absence
seagrass_data$SPECIES <- mapply(get_species_type, 
                                seagrass_data$C_SERRULAT, 
                                seagrass_data$S_ISOETIFO,
                                seagrass_data$T_HEMPRICH,
                                seagrass_data$Z_CAPRICOR)

table(seagrass_data$SPECIES)
## 
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR 
##       1256        101        823      10201

Data frame cleanup

# convert SPECIES to unordered factor
seagrass_data$SPECIES <- factor(seagrass_data$SPECIES, ordered=FALSE)

# rename misspelled column in original data set
names(seagrass_data)[names(seagrass_data) == "LATITUTDE"] <- "LATITUDE"

# only select columns relevant for our ML algorithm
seagrass_data <- seagrass_data %>% 
    select(SPECIES, LATITUDE, LONGITUDE, DEPTH, SEDIMENT, TIDAL)

# create negative depth for visualization purposes
seagrass_data$NEG_DEPTH <- -1 * seagrass_data$DEPTH

Write to .csv file.

write.csv(seagrass_data, "cleaned_data/seagrass_classification_data.csv", row.names = FALSE)




2. Regression and machine learning

2.1 Effect of sea water temperature on fish in the Great Barrier Reef

2.1.1 Exploratory data analysis and visualization

In this analysis, we will examine the effects of water temperature on the number of unique fish species observed in the Great Barrier Reef from 1997 to 2011.

Originally, we wanted to examine a relationship between water temperature and coral cover. However, we quickly saw that, when visualized, there does not seem to be a linear relation and we did not want to fit a mis-specified model.

Merge between temperature and coral cover data sets

temp <- read_csv("cleaned_data/aims_temperatures.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   avg_water_temp_2.0m_flat_site = col_double(),
##   avg_water_temp_9.0m_slope_site = col_double()
## )
coral_cover <- read_csv("cleaned_data/coral_cover.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   mean_live_coral_cover_percent = col_double(),
##   lower_conf_int = col_double(),
##   upper_conf_int = col_double(),
##   conf_int_span = col_double()
## )
temp_coral_cover <- merge(x=temp, y=coral_cover, by="date")
temp_coral_2m <- temp_coral_cover %>%
    ggplot(aes(avg_water_temp_2.0m_flat_site, mean_live_coral_cover_percent)) + 
    geom_point() +
    xlab("Avg water temperature at 2.0m (°C)") +
    ylab("Mean coral cover percentage")

temp_coral_9m <- temp_coral_cover %>%
    ggplot(aes(avg_water_temp_9.0m_slope_site, mean_live_coral_cover_percent)) + 
    geom_point() +
    xlab("Avg water temperature at 9.0m (°C)") +
    ylab("Mean coral cover percentage")

# display and save plots
temp_coral_2m

ggsave("viz/temp_coral_2m.png", plot=temp_coral_2m)
## Saving 7 x 5 in image
temp_coral_9m

ggsave("viz/temp_coral_9m.png", plot=temp_coral_9m)
## Saving 7 x 5 in image

As we can see, there is no linear relationship. While there are 2 or 3 clusters of coral cover percentage values, they seem to be pretty consistent across the range of water temperatures at 2.0m and 9.0 below sea level.

We also suspect that increase in water temperature could also affect diversity in sea life. So, we will examine the relationship between water temperature and the number of unique species of fish observed in the Great Barrier Ree. First, we can do some basic visualization by plotting the relationship between water temperatures at depth 2.0m and 9.0m with the number of unique fish species observedin the Great Barrier Reef.

fish <- read_csv("cleaned_data/fish_species_counts.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   num_of_species = col_double()
## )
fish_temp <- merge(x=temp, y=fish, by="date")

# water temp at 2.0m
fish_temp_2m <- fish_temp %>% ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) +
  geom_point() +
  xlab("Avg water temperature at 2.0m (°C)") +
  ylab("Num. of unique fish species")

# water temp at92.0m
fish_temp_9m <- fish_temp %>% ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) + 
  geom_point() +
  xlab("Avg water temperature at 9.0m (°C)") +
  ylab("Num. of unique fish species")

# display and save plots
fish_temp_2m

ggsave("viz/fish_temp_2m.png", plot=fish_temp_2m)
## Saving 7 x 5 in image
fish_temp_9m

ggsave("viz/fish_temp_9m.png", plot=fish_temp_9m)
## Saving 7 x 5 in image

There seems to be a bit of negative linear relationship going on, so we will fit a linear model examining the number of unique species discovered in relation to rising temperature.

2.1.2 Linear regression

Now, we can split our data set into train and test sets, using 0.6 to partition our data. Our outcome is the mean coral cover percentage and water temperature is our covariate. We will fit 2 linear regression models: one examining effect of water temperature at 2.0m and the other examining the effect of temperature at 9.0m.

train_index <- createDataPartition(y=fish_temp$num_of_species, times=1, p = 0.6, list=FALSE)

train_set <- fish_temp[train_index, ]
test_set <- fish_temp[-train_index, ]

Fit linear regression model:

fish_temp_2.0m <- lm(num_of_species ~ avg_water_temp_2.0m_flat_site, data=train_set)
summary(fish_temp_2.0m)
## 
## Call:
## lm(formula = num_of_species ~ avg_water_temp_2.0m_flat_site, 
##     data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.736  -9.756   0.550   9.425  38.614 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   130.2206    17.3808   7.492 2.05e-12 ***
## avg_water_temp_2.0m_flat_site  -2.4675     0.6342  -3.891 0.000135 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.2 on 203 degrees of freedom
## Multiple R-squared:  0.0694, Adjusted R-squared:  0.06481 
## F-statistic: 15.14 on 1 and 203 DF,  p-value: 0.0001354
fish_temp_9.0m <- lm(num_of_species ~ avg_water_temp_9.0m_slope_site, data=train_set)
summary(fish_temp_9.0m)
## 
## Call:
## lm(formula = num_of_species ~ avg_water_temp_9.0m_slope_site, 
##     data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.851  -9.798   0.328   9.725  39.067 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     133.942     17.657   7.586 1.17e-12 ***
## avg_water_temp_9.0m_slope_site   -2.614      0.647  -4.041 7.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.16 on 203 degrees of freedom
## Multiple R-squared:  0.07445,    Adjusted R-squared:  0.06989 
## F-statistic: 16.33 on 1 and 203 DF,  p-value: 7.556e-05

We see that the models are very similar in results. The coefficient with covariate 2.0m water temperature and 9.0 water temperature is -2.56 and -2.61, respectively.

Although we should not expect a major difference between how each of the two models performs, let’s compare them anyways to assess which water temperature depth is a better predictor of unique fish species observed in the Great Barrier Reef.

pred_2.0m <- predict(fish_temp_2.0m, test_set)
pred_9.0m <- predict(fish_temp_9.0m, test_set)

postResample(pred = pred_2.0m, obs = test_set$num_of_species)
##        RMSE    Rsquared         MAE 
## 15.15014383  0.08097776 11.25997201
postResample(pred = pred_9.0m, obs = test_set$num_of_species)
##        RMSE    Rsquared         MAE 
## 15.15118836  0.08046278 11.24961274

As expected, results are very similar.

We can assess this visually to confirm our results.

# water temp at 2.0m
fitted_fish_2m <- test_set %>% 
    ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) + 
    geom_point() +
    geom_abline(intercept=fish_temp_2.0m$coefficients[1], slope=fish_temp_2.0m$coefficients[2], col="red") +
    xlab("Avg water temperature at 2.0m (°C)") +
    ylab("Num. of unique fish species")

# water temp at 9.0m
fitted_fish_9m <- test_set %>% 
    ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) + 
    geom_point() +
    geom_abline(intercept=fish_temp_9.0m$coefficients[1], slope=fish_temp_9.0m$coefficients[2], col="blue") +
    xlab("Avg water temperature at 9.0m (°C)") +
    ylab("Num. of unique fish species")

# display and save plots
fitted_fish_2m

ggsave("viz/fitted_fish_2m.png", plot=fitted_fish_2m)
## Saving 7 x 5 in image
fitted_fish_9m

ggsave("viz/fitted_fish_9m.png", plot=fitted_fish_9m)
## Saving 7 x 5 in image

They both perform very similarly, and choosing either water temperature as our predictor will yield similar results.



2.2 Classification of sea grass species in the Great Barrier Reef from 1999 - 2003

Continuing on from sea life diversity, we have data on presence or absence of certain seagrass species in the Great Barrier Reef from 1999 to 2003. Let’s try to build a classifier to determine how location, and types of sediment and seabed may predict presence of certain sea grass.

As written in the data wrangling and cleaning RMarkdown/HTML file, we chose 4 species to classify: Cymodocea Serrulata, Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni).

2.2.1 Exploratory data analysis and visualization

seagrass <- read.csv("cleaned_data/seagrass_classification_data.csv", as.is =TRUE)

seagrass$SPECIES <- as.factor(seagrass$SPECIES)
seagrass$SEDIMENT <- as.factor(seagrass$SEDIMENT)
seagrass$TIDAL <- as.factor(seagrass$TIDAL)

head(seagrass)
##      SPECIES  LATITUDE LONGITUDE DEPTH SEDIMENT      TIDAL NEG_DEPTH
## 1 Z_CAPIRCOR -23.65857  151.1206     0      Mud intertidal         0
## 2 Z_CAPIRCOR -23.65840  151.1212     0      Mud intertidal         0
## 3 Z_CAPIRCOR -23.65898  151.1203     0      Mud intertidal         0
## 4 Z_CAPIRCOR -23.65970  151.1197     0      Mud intertidal         0
## 5 Z_CAPIRCOR -23.65884  151.1205     0      Mud intertidal         0
## 6 Z_CAPIRCOR -23.65993  151.1197     0      Mud intertidal         0

Here are the summary statistics of the sea grass data we cleaned and wrangled.

summary(seagrass)
##        SPECIES         LATITUDE        LONGITUDE         DEPTH        
##  C_SERRULAT: 1256   Min.   :-24.20   Min.   :142.5   Min.   : 0.0000  
##  S_ISOETIFO:  101   1st Qu.:-23.79   1st Qu.:146.8   1st Qu.: 0.0000  
##  T_HEMPRICH:  823   Median :-22.41   Median :150.5   Median : 0.0000  
##  Z_CAPIRCOR:10201   Mean   :-21.06   Mean   :149.0   Mean   : 0.9253  
##                     3rd Qu.:-19.17   3rd Qu.:151.3   3rd Qu.: 1.2279  
##                     Max.   :-10.75   Max.   :151.9   Max.   :29.3583  
##                                                                       
##    SEDIMENT           TIDAL        NEG_DEPTH       
##  Gravel:   1   intertidal:7433   Min.   :-29.3583  
##  Mud   :8969   subtidal  :4948   1st Qu.: -1.2279  
##  Reef  :  63                     Median :  0.0000  
##  Rock  :  66                     Mean   : -0.9253  
##  Rubble: 107                     3rd Qu.:  0.0000  
##  Sand  :3151                     Max.   :  0.0000  
##  Shell :  24

First we plot sea grass according to location (latitude and longitude). Then we will add a third axis (depth) to visualize this in 3-dimensions using plotly package. Since depth is measured in meters below sea level, we visualize this in negative values.

source("config.R")

Sys.setenv("plotly_username"=username)
Sys.setenv("plotly_api_key"=api_key)
seagrass_data_viz_2d <- seagrass %>%  ggplot() + 
  geom_point(aes(x=LATITUDE, y=LONGITUDE, color=SPECIES)) + 
  ggtitle("Seagrass present in the Great Barrier Reef, 1999 - 2003")

seagrass_data_viz_2d

ggsave("viz/seagrass_data_viz_2d.png", plot=seagrass_data_viz_2d)
## Saving 7 x 5 in image
plotly_3d <- plot_ly(seagrass, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers") %>%
  layout(title = "Seagrass present in the Great Barrier Reef, 1999 - 2003")

#api_create(plotly_3d, filename = "seagrass_exp_data_viz")

plotly_3d
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

We can also see how our categorical predictors relate to sea grass species.

seagrass_sediment <- seagrass %>% 
    ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

seagrass_tidal <- seagrass %>% 
    ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

seagrass_sediment

ggsave("viz/seagrass_sediment.png", plot=seagrass_sediment)
## Saving 7 x 5 in image
seagrass_tidal

ggsave("viz/seagrass_tidal.png", plot=seagrass_tidal)
## Saving 7 x 5 in image

2.2.2 Random forest

We will first use random forest to build a classifier and then use a multinomial logistic regression model, and compare the two.

Let’s first partition our data set into train and test sets. Since we have a lot more data here than in the linear regression model, we will partition it by 75%-25%.

# test-train split
seagrass_train_ind <- createDataPartition(y = seagrass$SPECIES, p=0.75, list=FALSE)

train_set <- seagrass[seagrass_train_ind, ]
test_set <- seagrass[-seagrass_train_ind, ]
rf_fit <- randomForest(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL, 
                       data=train_set,
                       mtry = 2)
rf_fit
## 
## Call:
##  randomForest(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH +      SEDIMENT + TIDAL, data = train_set, mtry = 2) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.78%
## Confusion matrix:
##            C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR class.error
## C_SERRULAT        702         21         13        206  0.25477707
## S_ISOETIFO         32         25          4         15  0.67105263
## T_HEMPRICH         48          1        562          7  0.09061489
## Z_CAPIRCOR         88          4          5       7554  0.01267808
rf_pred <- predict(rf_fit, newdata = test_set, type = "response")
confusionMatrix(table(pred = rf_pred, true = test_set$SPECIES))
## Confusion Matrix and Statistics
## 
##             true
## pred         C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
##   C_SERRULAT        246          7         14         24
##   S_ISOETIFO          6         13          2          1
##   T_HEMPRICH          1          0        179          0
##   Z_CAPIRCOR         61          5         10       2525
## 
## Overall Statistics
##                                         
##                Accuracy : 0.9577        
##                  95% CI : (0.95, 0.9645)
##     No Information Rate : 0.8242        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8558        
##                                         
##  Mcnemar's Test P-Value : 1.744e-07     
## 
## Statistics by Class:
## 
##                      Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity                    0.78344          0.520000           0.87317
## Specificity                    0.98381          0.997067           0.99965
## Pos Pred Value                 0.84536          0.590909           0.99444
## Neg Pred Value                 0.97574          0.996094           0.99108
## Prevalence                     0.10149          0.008080           0.06626
## Detection Rate                 0.07951          0.004202           0.05785
## Detection Prevalence           0.09405          0.007111           0.05818
## Balanced Accuracy              0.88363          0.758534           0.93641
##                      Class: Z_CAPIRCOR
## Sensitivity                     0.9902
## Specificity                     0.8603
## Pos Pred Value                  0.9708
## Neg Pred Value                  0.9493
## Prevalence                      0.8242
## Detection Rate                  0.8161
## Detection Prevalence            0.8407
## Balanced Accuracy               0.9252

We see that our classification model works quite well, especially for T_HEMPRICH and Z_CAPRICOR, which have 85%+ sensitivity and specificity. However, we got quite a low sensitivity for S_ISOETIFO. Recall to our data wrangling portion that S_ISOETIFO had only about 100 “Yes” observations. Since we had a small sample size for S_ISOETIFO relative to the other 3 seagrass species, this may have contributed to the low sensitivity.

We can visually assess our predicted values with true values of species to see how our model performed.

# true values
true_class <- plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers") %>%
  layout(title = "True seagrass classification in test set")

true_class
#api_create(true_class, filename = "seagrass_true_classification")

# predicted values
predicted_class <- plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~rf_pred, type="scatter3d", mode="markers") %>%
  layout(title = "Predicted seagrass classification in test set")

predicted_class
#api_create(predicted_class, filename = "seagrass_pred_classification")

For sediment type:

true_rf_sediment <- test_set %>% 
    ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge") + labs(fill="True species")

pred_rf_sediment <- test_set %>% 
    ggplot(aes(SEDIMENT, fill=rf_pred)) + geom_bar(width=.5, position = "dodge") + labs(fill="Predicted species")


true_rf_sediment

ggsave("viz/true_rf_sediment.png", plot=true_rf_sediment)
## Saving 7 x 5 in image
pred_rf_sediment

ggsave("viz/pred_rf_sediment.png", plot=pred_rf_sediment)
## Saving 7 x 5 in image

For seabed type:

true_rf_tidal <- test_set %>% 
    ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge") + labs(fill="True species")

pred_rf_tidal <- test_set %>% 
    ggplot(aes(TIDAL, fill=rf_pred)) + geom_bar(width=.5, position = "dodge") + labs(fill="Predicted species")

true_rf_tidal

ggsave("viz/true_rf_tidal.png", plot=true_rf_tidal)
## Saving 7 x 5 in image
pred_rf_tidal

ggsave("viz/pred_rf_tidal.png", plot=pred_rf_tidal)
## Saving 7 x 5 in image

Let’s examine variable importance.

variable_importance <- importance(rf_fit)
tmp <- data_frame(feature = rownames(variable_importance),
                  Gini = variable_importance[,1]) %>%
                  arrange(desc(Gini))
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tmp
## # A tibble: 5 x 2
##   feature    Gini
##   <chr>     <dbl>
## 1 LATITUDE  998. 
## 2 LONGITUDE 815. 
## 3 DEPTH     609. 
## 4 TIDAL     129. 
## 5 SEDIMENT   75.6

We see that longitude and latitude were very predictive of presence of seagrass followed by depth from sea level. The types of sediment and seabed (intertidal or subtidal seabed) are not very good predictors. Thus, it seems that the location of where the sea grass was discovered matters more than the various ocean floor properties.

tmp %>% ggplot(aes(x=reorder(feature, Gini), y=Gini)) + 
  geom_bar(stat='identity') +
  coord_flip() + xlab("Feature") +
  theme(axis.text=element_text(size=8))

2.2.3 Multinomial logistic regression

We can now try a multinomial logistic regression model to see how it compares to random forest. We will use the nnet package.

The logistic regression model is as follows:

multinom_fit <- multinom(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, data=train_set)
## # weights:  44 (30 variable)
## initial  value 12874.515732 
## iter  10 value 3270.716174
## iter  20 value 2601.142207
## iter  30 value 2489.683000
## iter  40 value 2433.518856
## iter  50 value 2416.481441
## iter  60 value 2385.203109
## iter  70 value 2383.328069
## iter  80 value 2383.236112
## iter  90 value 2382.659333
## iter 100 value 2382.411013
## final  value 2382.411013 
## stopped after 100 iterations
summary(multinom_fit)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, 
##     data = train_set)
## 
## Coefficients:
##            (Intercept)  LATITUDE LONGITUDE       DEPTH SEDIMENTMud SEDIMENTReef
## S_ISOETIFO  -382.40287 2.0666240 2.6715999  0.02457381   24.940188     25.83904
## T_HEMPRICH   -52.58375 1.5257042 0.4809432 -0.34456900    6.755413     10.08393
## Z_CAPIRCOR  -291.21628 0.6769761 1.4751511 -0.77693752   89.918927     26.39204
##            SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO     26.21524      -24.49808    25.602863      26.55965
## T_HEMPRICH     10.90509       11.41330     9.928641      10.04138
## Z_CAPIRCOR     87.31451       85.69719    89.064339      88.54503
## 
## Std. Errors:
##             (Intercept)   LATITUDE   LONGITUDE      DEPTH SEDIMENTMud
## S_ISOETIFO 0.0024742777 0.05772111 0.007798700 0.02456075   0.3746218
## T_HEMPRICH 0.0004550318 0.06191391 0.007840122 0.04190867   0.2940471
## Z_CAPIRCOR 0.0020779419 0.02873447 0.004627660 0.02901677   0.3547277
##            SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 6.199484e-01    0.7618304            NaN    0.3504961     0.9465096
## T_HEMPRICH 4.775093e-01    0.6031441      0.4402797    0.2603755     0.8929737
## Z_CAPIRCOR 5.077246e-26    0.6825874      1.0330768    0.3562248     0.8203911
## 
## Residual Deviance: 4764.822 
## AIC: 4824.822

Relative risk ratios where reference group is C_SERRULAT

exp(coef(multinom_fit))
##              (Intercept) LATITUDE LONGITUDE     DEPTH  SEDIMENTMud SEDIMENTReef
## S_ISOETIFO 8.405111e-167 7.898114 14.463091 1.0248782 6.782440e+10 1.666292e+11
## T_HEMPRICH  1.456022e-23 4.598381  1.617599 0.7085257 8.586942e+02 2.395484e+04
## Z_CAPIRCOR 3.360287e-127 1.967918  4.371696 0.4598120 1.125366e+39 2.896801e+11
##            SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 2.427350e+11   2.294145e-11 1.315777e+11  3.425384e+11
## T_HEMPRICH 5.445261e+04   9.051767e+04 2.050945e+04  2.295715e+04
## Z_CAPIRCOR 8.321628e+37   1.651272e+37 4.787965e+38  2.848501e+38
# predicted probabilities
predicted_prob <- predict(multinom_fit, newdata=test_set, type="probs")

# predicted classes
predicted_class <- predict(multinom_fit, newdata=test_set, type="class")

confusionMatrix(data = predicted_class, reference = test_set$SPECIES )
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
##   C_SERRULAT        134          8          9         24
##   S_ISOETIFO          1          0          0          2
##   T_HEMPRICH         18          3        174         18
##   Z_CAPIRCOR        161         14         22       2506
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9095          
##                  95% CI : (0.8988, 0.9194)
##     No Information Rate : 0.8242          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6644          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity                    0.42675         0.0000000           0.84878
## Specificity                    0.98525         0.9990225           0.98650
## Pos Pred Value                 0.76571         0.0000000           0.81690
## Neg Pred Value                 0.93834         0.9919120           0.98924
## Prevalence                     0.10149         0.0080802           0.06626
## Detection Rate                 0.04331         0.0000000           0.05624
## Detection Prevalence           0.05656         0.0009696           0.06884
## Balanced Accuracy              0.70600         0.4995112           0.91764
##                      Class: Z_CAPIRCOR
## Sensitivity                     0.9827
## Specificity                     0.6379
## Pos Pred Value                  0.9271
## Neg Pred Value                  0.8875
## Prevalence                      0.8242
## Detection Rate                  0.8100
## Detection Prevalence            0.8736
## Balanced Accuracy               0.8103

We see that our multinomial logistic model has about 91% overall accuracy, which performs a bit worse than random forest. However, the model performs very badly in predicting for S_ISOETIFO.

The model seems to predict T_HEMPRICH the best with 88.7% sensitivity and 98.8% specificity. The model also does not perform well for sensitivity of C_SERRULAT, with only about 41.7% sensitivity.

So, we see that the overall accuracy for multinomial logistic regression model vs random forest model was 95.6% and 90.9%, respectively. However, the overall accuracy stated for the logistic regression model is deceiving since it did not perform well in sensitivity in 2 out of 4 classes.